import functools
import os

import pandas as pd
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

# Load dataframe from CSV
# The relative path 'mip_codes.csv' is resolved against the current working
# directory; the "personal_issue" column of this frame is read later on.
print("Loading dataframe from CSV...")
df = pd.read_csv('mip_codes.csv')
print("Dataframe loaded successfully.")

# Initialize OpenAI API client
# The API key comes from the OPENAI environment variable; if it is not set,
# the lookup raises KeyError before the client is constructed.
print("Initializing OpenAI API client...")
client = OpenAI(api_key=os.environ['OPENAI'])
print("OpenAI API client initialized.")

# Embedding lookup, memoized so each distinct text is sent to the API once
@functools.lru_cache(maxsize=None)
def _fetch_embedding(text, model):
    """Request the embedding of *text* from the API and return it as a tuple.

    Results are cached per (text, model) pair. The cache is deliberately
    unbounded: the script scans the same issue list once per threshold, and a
    bounded LRU smaller than that list would evict every entry before it is
    reused. The trade-off is that one vector per distinct text stays in
    memory for the lifetime of the process.
    """
    response = client.embeddings.create(input=[text], model=model)
    # Stored as an immutable tuple so a cached entry can never be altered.
    return tuple(response.data[0].embedding)


def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding vector for *text* as a list.

    Newlines in *text* are replaced by spaces before the request is made.
    Repeated calls with the same normalized text and model reuse the cached
    result instead of issuing another API request.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        A new list holding the embedding of the first (only) input item, so
        callers may modify it without affecting later calls.
    """
    # Normalize before the cache lookup so texts that differ only in
    # newline-vs-space share one cache entry.
    text = text.replace("\n", " ")
    return list(_fetch_embedding(text, model))


# Function to check if issue is unique based on cosine similarity
def is_unique(issue_embedding, unique_embeddings, threshold):
    """Return True if the candidate is not too similar to any kept embedding.

    Args:
        issue_embedding: Embedding vector of the candidate issue.
        unique_embeddings: Embedding vectors of the issues kept so far.
        threshold: Cosine similarity at or above which the candidate is
            considered a duplicate of a kept issue.

    Returns:
        False if the cosine similarity between the candidate and any kept
        embedding is >= threshold; True otherwise, including when nothing
        has been kept yet.
    """
    # A candidate compared against nothing is unique, and cosine_similarity
    # does not accept an empty second argument.
    if len(unique_embeddings) == 0:
        return True

    # Score the candidate against every kept embedding in a single call
    # rather than invoking cosine_similarity once per pair.
    similarities = cosine_similarity([issue_embedding], unique_embeddings)
    return not bool((similarities >= threshold).any())


# Thresholds for cosine similarity
# Each value drives one independent de-duplication pass. An issue whose
# similarity to an already-kept issue is >= the threshold is treated as a
# duplicate, so lower values treat more pairs as duplicates.
thresholds = [0.99, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.65]


# Greedy de-duplication of issues by embedding similarity
def filter_unique_issues(issues, threshold):
    """Return the issues that are not near-duplicates of an earlier kept one.

    Issues are visited in order. Each one is embedded and compared against
    the embeddings of the issues kept so far; it is kept only when
    ``is_unique`` accepts it for the given threshold. Progress and the
    outcome for every issue are printed.

    Args:
        issues: Sequence of issue texts to de-duplicate.
        threshold: Similarity cut-off forwarded to ``is_unique``.

    Returns:
        List of the kept issue texts, in their original order.
    """
    kept_issues, kept_embeddings = [], []

    for position, issue in enumerate(issues, start=1):
        print(
            f"Processing issue {position}/{len(issues)} for threshold {threshold}..."
        )
        embedding = get_embedding(issue)

        # Duplicates are reported and skipped; everything below handles keeps.
        if not is_unique(embedding, kept_embeddings, threshold):
            print(f"Issue '{issue}' is not unique and not added to the list.")
            continue

        kept_issues.append(issue)
        kept_embeddings.append(embedding)
        print(f"Issue '{issue}' is unique and added to the list.")

    return kept_issues


# Extract personal issues
# Blank CSV cells are loaded by pandas as NaN (a float), which cannot be
# embedded as text; drop them up front so the run does not fail part-way
# through after API calls have already been made.
print("Extracting personal issues...")
personal_issues = df["personal_issue"].dropna().tolist()
print("Personal issues extracted successfully.")

# Run one independent de-duplication pass per threshold, keyed by threshold
print("Filtering unique issues for each threshold...")
unique_issues = dict(
    (cutoff, filter_unique_issues(personal_issues, cutoff))
    for cutoff in thresholds
)
print("Unique issues filtered successfully.")

# Tabulate the results: one row per threshold, with that threshold's kept
# issues stored together as a list in a single cell
print("Converting unique issues to DataFrame...")
unique_issues_df = pd.DataFrame({
    "threshold": thresholds,
    "unique_personal_issues": list(map(unique_issues.get, thresholds)),
})
print("Conversion to DataFrame successful.")

# Write the table to disk without the index column
print("Saving unique issues to CSV...")
unique_issues_df.to_csv('unique_issues.csv', index=False)
print("Unique issues saved to 'unique_issues.csv'")
